In [1]:
#Importing the basic librarires

import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

#from brokenaxes import brokenaxes
from statsmodels.formula import api
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10,6]

import warnings 
warnings.filterwarnings('ignore')
In [2]:
import csv
df=pd.read_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/Walmart-Sales.csv')
df.head(5)
Out[2]:
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment
0 1 5/2/10 1643690.90 0 42.31 2.572 211.096358 8.106
1 1 12/2/10 1641957.44 1 38.51 2.548 211.242170 8.106
2 1 19-02-2010 1611968.17 0 39.93 2.514 211.289143 8.106
3 1 26-02-2010 1409727.59 0 46.63 2.561 211.319643 8.106
4 1 5/3/10 1554806.68 0 46.50 2.625 211.350143 8.106
In [3]:
# Iterate over the columns in the DataFrame
for column in df.columns:
    # Check the data type of the column
    if df[column].dtype == 'object':
        # If the data type is object, the variable is categorical
        print(column, 'is a categorical variable')
    else:
        # If the data type is not object, the variable is continuous
        print(column, 'is a continuous variable')
Store is a continuous variable
Date is a categorical variable
Weekly_Sales is a continuous variable
Holiday_Flag is a continuous variable
Temperature is a continuous variable
Fuel_Price is a continuous variable
CPI is a continuous variable
Unemployment is a continuous variable
In [4]:
num_rows = len(df)
print('Number of rows:', num_rows)
print('Number of columns:', df.shape[1])

df.columns
df.info()
Number of rows: 7008
Number of columns: 8
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7008 entries, 0 to 7007
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         7008 non-null   int64  
 1   Date          7008 non-null   object 
 2   Weekly_Sales  7008 non-null   float64
 3   Holiday_Flag  7008 non-null   int64  
 4   Temperature   7008 non-null   float64
 5   Fuel_Price    7008 non-null   float64
 6   CPI           7008 non-null   float64
 7   Unemployment  7008 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 438.1+ KB
In [5]:
print(df.duplicated())
# remove duplicates and keep the first occurrence
df = df.drop_duplicates()

# print the cleaned dataframe
print(df)
num_rows = len(df)
print('Number of rows:', num_rows)

# get the number of missing rows
num_missing_rows = df.isnull().any(axis=1).sum()
print('Number of missing rows:', num_missing_rows)

# get the number of missing columns
num_missing_cols = df.isnull().any(axis=0).sum()
print('Number of missing columns:', num_missing_cols)
0       False
1       False
2       False
3       False
4       False
        ...  
7003     True
7004     True
7005     True
7006     True
7007     True
Length: 7008, dtype: bool
      Store        Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  \
0         1      5/2/10    1643690.90             0        42.31       2.572   
1         1     12/2/10    1641957.44             1        38.51       2.548   
2         1  19-02-2010    1611968.17             0        39.93       2.514   
3         1  26-02-2010    1409727.59             0        46.63       2.561   
4         1      5/3/10    1554806.68             0        46.50       2.625   
...     ...         ...           ...           ...          ...         ...   
6430     45  28-09-2012     713173.95             0        64.88       3.997   
6431     45     5/10/12     733455.07             0        64.89       3.985   
6432     45    12/10/12     734464.36             0        54.47       4.000   
6433     45  19-10-2012     718125.53             0        56.47       3.969   
6434     45  26-10-2012     760281.43             0        58.85       3.882   

             CPI  Unemployment  
0     211.096358         8.106  
1     211.242170         8.106  
2     211.289143         8.106  
3     211.319643         8.106  
4     211.350143         8.106  
...          ...           ...  
6430  192.013558         8.684  
6431  192.170412         8.667  
6432  192.327265         8.667  
6433  192.330854         8.667  
6434  192.308899         8.667  

[6435 rows x 8 columns]
Number of rows: 6435
Number of missing rows: 0
Number of missing columns: 0
In [6]:
from datetime import datetime

def convert_date(date_string):
    try:
        date = datetime.strptime(date_string, "%d/%m/%y")
    except ValueError:
        date = datetime.strptime(date_string, "%d-%m-%Y")
    return date.strftime("%d-%m-%Y")

# apply conversion function to Date column
df["Date"] = df["Date"].apply(convert_date)

print(df)
      Store        Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  \
0         1  05-02-2010    1643690.90             0        42.31       2.572   
1         1  12-02-2010    1641957.44             1        38.51       2.548   
2         1  19-02-2010    1611968.17             0        39.93       2.514   
3         1  26-02-2010    1409727.59             0        46.63       2.561   
4         1  05-03-2010    1554806.68             0        46.50       2.625   
...     ...         ...           ...           ...          ...         ...   
6430     45  28-09-2012     713173.95             0        64.88       3.997   
6431     45  05-10-2012     733455.07             0        64.89       3.985   
6432     45  12-10-2012     734464.36             0        54.47       4.000   
6433     45  19-10-2012     718125.53             0        56.47       3.969   
6434     45  26-10-2012     760281.43             0        58.85       3.882   

             CPI  Unemployment  
0     211.096358         8.106  
1     211.242170         8.106  
2     211.289143         8.106  
3     211.319643         8.106  
4     211.350143         8.106  
...          ...           ...  
6430  192.013558         8.684  
6431  192.170412         8.667  
6432  192.327265         8.667  
6433  192.330854         8.667  
6434  192.308899         8.667  

[6435 rows x 8 columns]
In [7]:
import matplotlib.pyplot as plt
import scipy.stats as stats

# generate qqplot for Weekly_Sales using the converted data
fig, ax = plt.subplots()
stats.probplot(df['Weekly_Sales'], dist='norm', plot=ax)
ax.set_title('Q-Q plot for Weekly_Sales')
ax.set_xlabel('Theoretical quantiles')
ax.set_ylabel('Sample quantiles')
plt.show()
In [8]:
import matplotlib.pyplot as plt

# Aggregating data by 'Store' and Finding sum of 'Weekly_Sales'
Store_Sales = df.groupby(['Store']).sum()

# Changing column name of sales
Store_Sales.rename(columns={'Weekly_Sales': 'Total_Sales_by_Store'}, inplace=True)

# Finding out Store with highest Sales
Store_Sales = Store_Sales.sort_values(by=['Total_Sales_by_Store'], ascending=False)

# Choosing the first store that comes in this order
max_store = Store_Sales.index[0]
max_sales = Store_Sales.iloc[0]['Total_Sales_by_Store']

# Printing the output
print(f"Store no {max_store} has the maximum sales and the value is {max_sales}")

# Plotting the results
plt.bar(Store_Sales.index, Store_Sales['Total_Sales_by_Store'])
plt.title('Total Sales by Store')
plt.xlabel('Store')
plt.ylabel('Total Sales')
plt.show()
Store no 20 has the maximum sales and the value is 301397792.46
In [9]:
#Weekly sales average
weekly_sales = df[['Date', 'Weekly_Sales']].groupby('Date', as_index=False).mean()
weekly_sales['Date'] = pd.to_datetime(weekly_sales['Date'], format="%d-%m-%Y")
weekly_sales = weekly_sales.sort_values('Date').reset_index(drop=True)
weekly_sales['Date'] = pd.Categorical(weekly_sales['Date'])

#plotting weekly mean sales
plt.figure(figsize=(14,8))
b = sns.lineplot(data=weekly_sales, x="Date", y="Weekly_Sales")
b.set(xticklabels=weekly_sales['Date'][::10])
b.set_xticklabels(labels=weekly_sales['Date'][::10], rotation=90, ha='right')
plt.xlabel('Week')
plt.ylabel('Mean Sales of Week')
plt.title('Weekly Mean Sales')
plt.show()

# Finding the highest and lowest mean sales
highest_mean = weekly_sales.loc[weekly_sales['Weekly_Sales'].idxmax()]
lowest_mean = weekly_sales.loc[weekly_sales['Weekly_Sales'].idxmin()]

print(f"Highest Mean Sales: {highest_mean['Weekly_Sales']:.2f} on {highest_mean['Date'].strftime('%d-%m-%Y')}")
print(f"Lowest Mean Sales: {lowest_mean['Weekly_Sales']:.2f} on {lowest_mean['Date'].strftime('%d-%m-%Y')}")
Highest Mean Sales: 1798475.90 on 24-12-2010
Lowest Mean Sales: 879996.73 on 28-01-2011
In [10]:
#Holiday and Noholiday sales
df['Date'] = pd.to_datetime(df['Date'])
SuperBowl = pd.to_datetime(['12-02-2010', '11-02-2011', '10-02-2012', '08-02-2013'])
LabourDay = pd.to_datetime(['10-09-2010', '09-09-2011', '07-09-2012', '06-09-2013'])
Thanksgiving = pd.to_datetime(['26-11-2010','25-11-2011', '23-11-2012', '29-11-2013'])
Christmas = pd.to_datetime(['31-12-2010', '30-12-2011', '28-12-2012', '27-12-2013'])

walmart_h = df[['Date', 'Weekly_Sales']]

walmart_h['hflag'] = np.where(walmart_h['Date'].isin(SuperBowl), 'SB',
np.where(walmart_h['Date'].isin(LabourDay), 'LD',
np.where(walmart_h['Date'].isin(Thanksgiving), 'TG',
np.where(walmart_h['Date'].isin(Christmas), 'CH', 'None'))))
pd.options.display.float_format = '{:.0f}'.format

walmart_h_mean = walmart_h.groupby('hflag').mean()
print(walmart_h_mean)
       Weekly_Sales
hflag              
CH           960833
LD          1042427
None        1041256
SB          1079128
TG          1471273
In [11]:
df.columns
Out[11]:
Index(['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature',
       'Fuel_Price', 'CPI', 'Unemployment'],
      dtype='object')
In [12]:
# Relabeling the columns in the dataset

df.Date=pd.to_datetime(df.Date)

df['weekday'] = df.Date.dt.weekday
df['month'] = df.Date.dt.month
df['year'] = df.Date.dt.year

# df['Monthly_Quarter'] = df.month.map({1:'Q1',2:'Q1',3:'Q1',4:'Q2',5:'Q2',6:'Q2',7:'Q3',
#                                       8:'Q3',9:'Q3',10:'Q4',11:'Q4',12:'Q4'})

df.drop(['Date'], axis=1, inplace=True)#,'month'

target = 'Weekly_Sales'
features = [i for i in df.columns if i not in [target]]
original_df = df.copy(deep=True)

df.head()
Out[12]:
Store Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment weekday month year
0 1 1643691 0 42 3 211 8 6 5 2010
1 1 1641957 1 39 3 211 8 3 12 2010
2 1 1611968 0 40 3 211 8 4 2 2010
3 1 1409728 0 47 3 211 8 4 2 2010
4 1 1554807 0 46 3 211 8 0 5 2010
In [13]:
df.columns
Out[13]:
Index(['Store', 'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price',
       'CPI', 'Unemployment', 'weekday', 'month', 'year'],
      dtype='object')
In [14]:
#Checking the data types of all the columns
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6435 entries, 0 to 6434
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Weekly_Sales  6435 non-null   float64
 2   Holiday_Flag  6435 non-null   int64  
 3   Temperature   6435 non-null   float64
 4   Fuel_Price    6435 non-null   float64
 5   CPI           6435 non-null   float64
 6   Unemployment  6435 non-null   float64
 7   weekday       6435 non-null   int64  
 8   month         6435 non-null   int64  
 9   year          6435 non-null   int64  
dtypes: float64(5), int64(5)
memory usage: 553.0 KB
In [15]:
#Checking number of unique rows in each feature
df.nunique().sort_values()
Out[15]:
Holiday_Flag       2
year               3
weekday            7
month             12
Store             45
Unemployment     349
Fuel_Price       892
CPI             2145
Temperature     3528
Weekly_Sales    6435
dtype: int64
In [16]:
#Checking for the number of unique rows in each variable

nu = df[features].nunique().sort_values()
nf = []; cf = []; nnf = 0; ncf = 0; #categorical and numerical variables 

for i in range(df[features].shape[1]):
    if nu.values[i]<=45:cf.append(nu.index[i])
    else: nf.append(nu.index[i])

print('\n\033[1mInference:\033[0m The Datset contains {} numerical & {} categorical features.'.format(len(nf),len(cf)))
      
Inference: The Datset contains 4 numerical & 5 categorical features.
In [17]:
#Checking the stats of all the columns
display(df.describe())
Store Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment weekday month year
count 6435 6435 6435 6435 6435 6435 6435 6435 6435 6435
mean 23 1046965 0 61 3 172 8 4 6 2011
std 13 564367 0 18 0 39 2 1 3 1
min 1 209986 0 -2 2 126 4 0 1 2010
25% 12 553350 0 47 3 132 7 4 4 2010
50% 23 960746 0 63 3 183 8 4 6 2011
75% 34 1420159 0 75 4 213 9 4 9 2012
max 45 3818686 1 100 4 227 14 6 12 2012
In [18]:
#Analyzing the distribution of the target variable
plt.figure(figsize=[8,4])
sns.distplot(df[target], color='g',hist_kws=dict(edgecolor="black", linewidth=2), bins=30)
plt.title('Target Variable Distribution - Median Value of Sales ($1Ms)')
plt.show()
In [19]:
#Visualisation of the categorical features 

print('\033[1mVisualising the Categorical Features:'.center(100))

n=2
plt.figure(figsize=[15,3*math.ceil(len(cf)/n)])

for i in range(len(cf)):
    if df[cf[i]].nunique()<=8:
        plt.subplot(math.ceil(len(cf)/n),n,i+1)
        sns.countplot(df[cf[i]])
    else:
        plt.subplot(3,1,i-1)
        sns.countplot(df[cf[i]])
        
plt.tight_layout()
plt.show()
                             Visualising the Categorical Features:                              
In [20]:
#Visualisation of the numeric variables
print('\033[1mNumeric variables Distribution'.center(130))

n=4
clr=['r','g','b','g','b','r']

plt.figure(figsize=[15,6*math.ceil(len(nf)/n)])
for i in range(len(nf)):
    plt.subplot(math.ceil(len(nf)/3),n,i+1)
    sns.distplot(df[nf[i]],hist_kws=dict(edgecolor="black", 
    linewidth=2), bins=10, color=list(np.random.randint([255,255,255])/255))
plt.tight_layout()
plt.show()

plt.figure(figsize=[15,6*math.ceil(len(nf)/n)])
for i in range(len(nf)):
    plt.subplot(math.ceil(len(nf)/3),n,i+1)
    df.boxplot(nf[i])
plt.tight_layout()
plt.show()
                                                Numeric variables Distribution                                                
In [21]:
#Understanding the relationship between all the variables in the dataset
g = sns.pairplot(df)
plt.title('Pairplots for all the Variables')
g.map_upper(sns.kdeplot, levels=4, color=".2")
plt.show()
In [22]:
#Check for empty elements
nvc = pd.DataFrame(df.isnull().sum().sort_values(), columns=['Total Null Values'])
nvc['Percentage'] = round(nvc['Total Null Values']/df.shape[0],3)*100
print(nvc)
              Total Null Values  Percentage
Store                         0           0
Weekly_Sales                  0           0
Holiday_Flag                  0           0
Temperature                   0           0
Fuel_Price                    0           0
CPI                           0           0
Unemployment                  0           0
weekday                       0           0
month                         0           0
year                          0           0
In [23]:
#Converting categorical variables to Numeric variables

df3 = df.copy()

ecc = nvc[nvc['Percentage']!=0].index.values
fcc = [i for i in cf if i not in ecc]

#One-Hot Binary Encoding
oh=True
dm=True
for i in fcc:
    #print(i)
    if df3[i].nunique()==2:
        if oh==True: print("\033[1mOne-Hot Encoding on variables:\033[0m")
        print(i);oh=False
        df3[i]=pd.get_dummies(df3[i], drop_first=True, prefix=str(i))
    if (df3[i].nunique()>2):
        if dm==True: print("\n\033[1mDummy Encoding on variables:\033[0m")
        print(i);dm=False
        df3 = pd.concat([df3.drop([i], axis=1), pd.DataFrame(pd.get_dummies(df3[i], 
                drop_first=True, prefix=str(i)))],axis=1)
        
df3.shape
One-Hot Encoding on variables:
Holiday_Flag

Dummy Encoding on variables:
year
weekday
month
Store
Out[23]:
(6435, 69)
In [24]:
#Removal of outliers:

df1 = df3.copy()

#features1 = [i for i in features if i not in ['RAD','CHAS']]
features1 = nf

for i in features1:
    Q1 = df1[i].quantile(0.25)
    Q3 = df1[i].quantile(0.75)
    IQR = Q3 - Q1
    df1 = df1[df1[i] <= (Q3+(1.5*IQR))]
    df1 = df1[df1[i] >= (Q1-(1.5*IQR))]
    df1 = df1.reset_index(drop=True)
display(df1.head())
print('\n\033[1mInference:\033[0m\nBefore removal of outliers, the dataset contains {} samples.'
      .format(df3.shape[0]))
print('After removal of outliers, the dataset contains {} samples.'.format(df1.shape[0]))
Weekly_Sales Holiday_Flag Temperature Fuel_Price CPI Unemployment year_2011 year_2012 weekday_1 weekday_2 ... Store_36 Store_37 Store_38 Store_39 Store_40 Store_41 Store_42 Store_43 Store_44 Store_45
0 1643691 0 42 3 211 8 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 1641957 1 39 3 211 8 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1611968 0 40 3 211 8 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 1409728 0 47 3 211 8 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1554807 0 46 3 211 8 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 69 columns

Inference:
Before removal of outliers, the dataset contains 6435 samples.
After removal of outliers, the dataset contains 5953 samples.
In [25]:
#Final Dataset size after performing Preprocessing

df = df1.copy()
df.columns=[i.replace('-','_') for i in df.columns]

plt.title('Final Dataset')
plt.pie([df.shape[0], original_df.shape[0]-df.shape[0]], radius = 1, labels=['Retained','Dropped'], counterclock=False, 
        autopct='%1.1f%%', pctdistance=0.9, explode=[0,0], shadow=True)
plt.pie([df.shape[0]], labels=['100%'], labeldistance=-0, radius=0.78)
plt.show()
In [26]:
#Visualizing the correlation matrix

print('\033[1mCorrelation Matrix'.center(100))
plt.figure(figsize=[25,20])
sns.heatmap(df.corr(), annot=True, fmt='.2f', vmin=-1, vmax=1, center=0) #cmap='BuGn'
plt.show()
                                       Correlation Matrix                                       
In [36]:
#Splitting the data into training and testing sets

m=[]
for i in df.columns.values:
    m.append(i.replace(' ','_'))
    
df.columns = m
X = df.drop([target],axis=1)
Y = df[target]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, Y, train_size=0.7, test_size=0.3, random_state=100)
Train_X.reset_index(drop=True,inplace=True)

print('Original data set ',X.shape,Y.shape,'\nTraining data set ',
      Train_X.shape,Train_Y.shape,'\nTesting data set ', Test_X.shape,'', Test_Y.shape)
Original data set  (5953, 68) (5953,) 
Training data set  (4167, 68) (4167,) 
Testing data set  (1786, 68)  (1786,)
In [37]:
#Performing Standardization
std = StandardScaler()

print('\033[1mStandardardization on Training data set'.center(120))
Train_X_std = std.fit_transform(Train_X)
Train_X_std = pd.DataFrame(Train_X_std, columns=X.columns)
display(Train_X_std.describe())

print('\n','\033[1mStandardardization on Testing data set'.center(120))
Test_X_std = std.transform(Test_X)
Test_X_std = pd.DataFrame(Test_X_std, columns=X.columns)
display(Test_X_std.describe())
                                      Standardardization on Training data set                                       
Holiday_Flag Temperature Fuel_Price CPI Unemployment year_2011 year_2012 weekday_1 weekday_2 weekday_3 ... Store_36 Store_37 Store_38 Store_39 Store_40 Store_41 Store_42 Store_43 Store_44 Store_45
count 4167 4167 4167 4167 4167 4167 4167 4167 4167 4167 ... 4167 4167 4167 4167 4167 4167 4167 4167 4167 4167
mean 0 0 -0 -0 -0 0 -0 -0 0 0 ... 0 -0 -0 0 -0 0 -0 -0 0 -0
std 1 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1
min -0 -3 -2 -1 -3 -1 -1 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
25% -0 -1 -1 -1 -1 -1 -1 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
50% -0 0 0 0 0 -1 -1 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
75% -0 1 1 1 1 1 2 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
max 4 2 2 1 3 1 2 4 9 4 ... 6 6 23 6 8 7 6 6 7 7

8 rows × 68 columns

                                        Standardardization on Testing data set                                       
Holiday_Flag Temperature Fuel_Price CPI Unemployment year_2011 year_2012 weekday_1 weekday_2 weekday_3 ... Store_36 Store_37 Store_38 Store_39 Store_40 Store_41 Store_42 Store_43 Store_44 Store_45
count 1786 1786 1786 1786 1786 1786 1786 1786 1786 1786 ... 1786 1786 1786 1786 1786 1786 1786 1786 1786 1786
mean 0 0 0 0 -0 0 -0 0 0 0 ... 0 -0 0 0 0 0 -0 -0 0 0
std 1 1 1 1 1 1 1 1 1 1 ... 1 1 2 1 1 1 1 1 1 1
min -0 -3 -2 -1 -3 -1 -1 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
25% -0 -1 -1 -1 -1 -1 -1 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
50% -0 0 0 0 0 -1 -1 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
75% -0 1 1 1 1 1 2 -0 -0 -0 ... -0 -0 -0 -0 -0 -0 -0 -0 -0 -0
max 4 2 2 1 3 1 2 4 9 4 ... 6 6 23 6 8 7 6 6 7 7

8 rows × 68 columns

In [38]:
#Testing a Linear Regression model with statsmodels

Train_xy = pd.concat([Train_X_std,Train_Y.reset_index(drop=True)],axis=1)
a = Train_xy.columns.values

API = api.ols(formula='{} ~ {}'.format(target,' + '.join(i for i in Train_X.columns)), data=Train_xy).fit()
#print(API.conf_int())
#print(API.pvalues)
API.summary()
Out[38]:
OLS Regression Results
Dep. Variable: Weekly_Sales R-squared: 0.931
Model: OLS Adj. R-squared: 0.930
Method: Least Squares F-statistic: 819.4
Date: Mon, 27 Mar 2023 Prob (F-statistic): 0.00
Time: 17:01:33 Log-Likelihood: -55576.
No. Observations: 4167 AIC: 1.113e+05
Df Residuals: 4098 BIC: 1.117e+05
Df Model: 68
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 1.051e+06 2342.620 448.804 0.000 1.05e+06 1.06e+06
Holiday_Flag 4498.6227 2653.683 1.695 0.090 -704.037 9701.282
Temperature -1.108e+04 3914.719 -2.829 0.005 -1.88e+04 -3400.650
Fuel_Price 7066.0594 6618.561 1.068 0.286 -5909.915 2e+04
CPI 5.058e+05 7.09e+04 7.133 0.000 3.67e+05 6.45e+05
Unemployment -6.351e+04 7946.015 -7.993 0.000 -7.91e+04 -4.79e+04
year_2011 -3.383e+04 6928.894 -4.882 0.000 -4.74e+04 -2.02e+04
year_2012 -7.531e+04 9641.447 -7.811 0.000 -9.42e+04 -5.64e+04
weekday_1 6114.1130 3290.061 1.858 0.063 -336.193 1.26e+04
weekday_2 -1.125e+04 2766.375 -4.067 0.000 -1.67e+04 -5826.151
weekday_3 -1.331e+04 3300.944 -4.032 0.000 -1.98e+04 -6839.216
weekday_4 -1.538e+04 4407.517 -3.491 0.000 -2.4e+04 -6743.868
weekday_5 -1.239e+04 3050.293 -4.062 0.000 -1.84e+04 -6409.791
weekday_6 -1601.3677 3340.154 -0.479 0.632 -8149.883 4947.148
month_2 2.829e+04 3450.045 8.201 0.000 2.15e+04 3.51e+04
month_3 1.983e+04 3515.224 5.642 0.000 1.29e+04 2.67e+04
month_4 2.002e+04 3735.968 5.358 0.000 1.27e+04 2.73e+04
month_5 2.055e+04 3749.128 5.482 0.000 1.32e+04 2.79e+04
month_6 3.053e+04 3528.249 8.654 0.000 2.36e+04 3.75e+04
month_7 1.788e+04 3765.347 4.750 0.000 1.05e+04 2.53e+04
month_8 2.283e+04 3647.472 6.259 0.000 1.57e+04 3e+04
month_9 1.021e+04 3735.982 2.734 0.006 2887.962 1.75e+04
month_10 1.466e+04 3810.604 3.846 0.000 7186.422 2.21e+04
month_11 4.165e+04 3511.127 11.863 0.000 3.48e+04 4.85e+04
month_12 6.25e+04 3845.898 16.252 0.000 5.5e+04 7e+04
Store_2 5.856e+04 3299.068 17.750 0.000 5.21e+04 6.5e+04
Store_3 -1.87e+05 3424.808 -54.595 0.000 -1.94e+05 -1.8e+05
Store_4 2.234e+05 2.29e+04 9.768 0.000 1.79e+05 2.68e+05
Store_5 -1.932e+05 3437.689 -56.200 0.000 -2e+05 -1.86e+05
Store_6 -1.019e+04 3446.121 -2.958 0.003 -1.7e+04 -3438.411
Store_7 -9.879e+04 6705.936 -14.731 0.000 -1.12e+05 -8.56e+04
Store_8 -1.212e+05 3772.879 -32.129 0.000 -1.29e+05 -1.14e+05
Store_9 -1.641e+05 3569.151 -45.972 0.000 -1.71e+05 -1.57e+05
Store_10 2.315e+05 2.44e+04 9.475 0.000 1.84e+05 2.79e+05
Store_11 -3.924e+04 3365.721 -11.658 0.000 -4.58e+04 -3.26e+04
Store_12 4.479e+04 9737.960 4.600 0.000 2.57e+04 6.39e+04
Store_13 2.298e+05 2.4e+04 9.578 0.000 1.83e+05 2.77e+05
Store_14 1.408e+05 8886.353 15.842 0.000 1.23e+05 1.58e+05
Store_15 1.675e+04 2.26e+04 0.742 0.458 -2.75e+04 6.1e+04
Store_16 -1.261e+05 7116.715 -17.724 0.000 -1.4e+05 -1.12e+05
Store_17 6.131e+04 2.5e+04 2.453 0.014 1.23e+04 1.1e+05
Store_18 9.59e+04 2.3e+04 4.171 0.000 5.08e+04 1.41e+05
Store_19 1.41e+05 2.23e+04 6.331 0.000 9.73e+04 1.85e+05
Store_20 9.556e+04 3833.227 24.929 0.000 8.8e+04 1.03e+05
Store_21 -1.244e+05 3306.298 -37.632 0.000 -1.31e+05 -1.18e+05
Store_22 7.636e+04 2.21e+04 3.449 0.001 3.3e+04 1.2e+05
Store_23 9.665e+04 2.03e+04 4.772 0.000 5.69e+04 1.36e+05
Store_24 1.334e+05 2.26e+04 5.912 0.000 8.91e+04 1.78e+05
Store_25 -1.25e+05 3890.176 -32.138 0.000 -1.33e+05 -1.17e+05
Store_26 7.587e+04 2.29e+04 3.319 0.001 3.1e+04 1.21e+05
Store_27 1.91e+05 2.18e+04 8.745 0.000 1.48e+05 2.34e+05
Store_28 5.208e+04 8700.647 5.986 0.000 3.5e+04 6.91e+04
Store_29 1.971e+04 2.34e+04 0.843 0.399 -2.61e+04 6.56e+04
Store_30 -1.756e+05 3307.046 -53.085 0.000 -1.82e+05 -1.69e+05
Store_31 -2.156e+04 3191.610 -6.756 0.000 -2.78e+04 -1.53e+04
Store_32 -8966.6532 6916.297 -1.296 0.195 -2.25e+04 4593.044
Store_33 -1.93e+04 2.46e+04 -0.784 0.433 -6.75e+04 2.9e+04
Store_34 9.881e+04 2.42e+04 4.082 0.000 5.14e+04 1.46e+05
Store_35 6.249e+04 2.1e+04 2.969 0.003 2.12e+04 1.04e+05
Store_36 -1.758e+05 3286.652 -53.497 0.000 -1.82e+05 -1.69e+05
Store_37 -1.579e+05 3341.738 -47.236 0.000 -1.64e+05 -1.51e+05
Store_38 1.107e+04 7505.460 1.474 0.140 -3649.149 2.58e+04
Store_39 -1.049e+04 3284.031 -3.194 0.001 -1.69e+04 -4050.460
Store_40 3.619e+04 1.85e+04 1.951 0.051 -172.130 7.25e+04
Store_41 -7639.6166 6874.380 -1.111 0.266 -2.11e+04 5837.901
Store_42 2.521e+04 2.5e+04 1.008 0.314 -2.38e+04 7.42e+04
Store_43 -1.086e+05 4576.890 -23.730 0.000 -1.18e+05 -9.96e+04
Store_44 -2.663e+04 2.4e+04 -1.111 0.267 -7.36e+04 2.04e+04
Store_45 -5.128e+04 8473.219 -6.052 0.000 -6.79e+04 -3.47e+04
Omnibus: 2974.895 Durbin-Watson: 2.000
Prob(Omnibus): 0.000 Jarque-Bera (JB): 97387.972
Skew: 3.002 Prob(JB): 0.00
Kurtosis: 25.910 Cond. No. 80.6


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [29]:
from sklearn.preprocessing import PolynomialFeatures
Trr=[]; Tss=[]; n=3
order=['ord-'+str(i) for i in range(2,n)]
#Trd = pd.DataFrame(np.zeros((10,n-2)), columns=order)
#Tsd = pd.DataFrame(np.zeros((10,n-2)), columns=order)

DROP=[];b=[]

for i in range(len(Train_X_std.columns)):
    vif = pd.DataFrame()
    X = Train_X_std.drop(DROP,axis=1)
    vif['Features'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by = "VIF", ascending = False)
    vif.reset_index(drop=True, inplace=True)
    if vif.loc[0][1]>1:
        DROP.append(vif.loc[0][0])
        LR = LinearRegression()
        LR.fit(Train_X_std.drop(DROP,axis=1), Train_Y)

        pred1 = LR.predict(Train_X_std.drop(DROP,axis=1))
        pred2 = LR.predict(Test_X_std.drop(DROP,axis=1))
        
        Trr.append(np.sqrt(mean_squared_error(Train_Y, pred1)))
        Tss.append(np.sqrt(mean_squared_error(Test_Y, pred2)))

print('Dropped Features --> ',DROP)

undropped_columns = list(set(Train_X_std.columns) - set(DROP))
print("Undropped columns:", undropped_columns)

plt.plot(Trr, label='Train RMSE')
plt.plot(Tss, label='Test RMSE')

plt.legend()
plt.grid()
plt.show()
Dropped Features -->  ['CPI', 'Unemployment', 'Fuel_Price', 'weekday_4', 'month_7', 'Store_17', 'Temperature', 'month_12', 'year_2012', 'Store_29', 'month_2', 'Store_2', 'month_11', 'Store_27', 'month_5', 'Store_16', 'Store_18', 'month_10', 'Store_22', 'Holiday_Flag', 'year_2011', 'Store_33', 'Store_21', 'month_9', 'Store_19', 'Store_25', 'Store_15', 'Store_14', 'Store_37', 'month_4', 'Store_20', 'Store_24', 'Store_13', 'Store_45', 'Store_44', 'Store_43', 'weekday_5', 'month_8', 'Store_42', 'Store_11', 'Store_41', 'Store_10', 'weekday_3', 'Store_6', 'weekday_1', 'weekday_2', 'Store_26', 'Store_23', 'Store_40', 'Store_39', 'Store_35', 'Store_9', 'month_3', 'Store_3', 'Store_4', 'Store_8']
Undropped columns: ['Store_7', 'Store_38', 'month_6', 'Store_30', 'Store_32', 'Store_28', 'Store_34', 'weekday_6', 'Store_12', 'Store_5', 'Store_36', 'Store_31']
In [40]:
from sklearn.preprocessing import PolynomialFeatures
Trr=[]; Tss=[]; n=3
order=['ord-'+str(i) for i in range(2,n)]
Trd = pd.DataFrame(np.zeros((10,n-2)), columns=order)
Tsd = pd.DataFrame(np.zeros((10,n-2)), columns=order)

m=df.shape[1]-2
for i in range(m):
    lm = LinearRegression()
    rfe = RFE(lm,n_features_to_select=Train_X_std.shape[1]-i)             
    rfe = rfe.fit(Train_X_std, Train_Y)

    LR = LinearRegression()
    LR.fit(Train_X_std.loc[:,rfe.support_], Train_Y)

    pred1 = LR.predict(Train_X_std.loc[:,rfe.support_])
    pred2 = LR.predict(Test_X_std.loc[:,rfe.support_])

    Trr.append(np.sqrt(mean_squared_error(Train_Y, pred1)))
    Tss.append(np.sqrt(mean_squared_error(Test_Y, pred2)))
    
plt.plot(Trr, label='Train RMSE')
plt.plot(Tss, label='Test RMSE')

plt.legend()
plt.grid()
plt.show()
In [41]:
#Shortlisting the selected Features (with RFE)

lm = LinearRegression()
rfe = RFE(lm,n_features_to_select=Train_X_std.shape[1]-28)
rfe = rfe.fit(Train_X_std, Train_Y)

LR = LinearRegression()
LR.fit(Train_X_std.loc[:,rfe.support_], Train_Y)

print(Train_X_std.loc[:,rfe.support_].columns)

pred1 = LR.predict(Train_X_std.loc[:,rfe.support_])
pred2 = LR.predict(Test_X_std.loc[:,rfe.support_])

print(np.sqrt(mean_squared_error(Train_Y, pred1)))
print(np.sqrt(mean_squared_error(Test_Y, pred2)))

Train_X_std = Train_X_std.loc[:,rfe.support_]
Test_X_std = Test_X_std.loc[:,rfe.support_]
Index(['CPI', 'Unemployment', 'year_2011', 'year_2012', 'month_12', 'Store_2',
       'Store_3', 'Store_4', 'Store_5', 'Store_7', 'Store_8', 'Store_9',
       'Store_10', 'Store_12', 'Store_13', 'Store_14', 'Store_15', 'Store_16',
       'Store_17', 'Store_18', 'Store_19', 'Store_20', 'Store_21', 'Store_22',
       'Store_23', 'Store_24', 'Store_25', 'Store_26', 'Store_27', 'Store_28',
       'Store_29', 'Store_30', 'Store_34', 'Store_35', 'Store_36', 'Store_37',
       'Store_40', 'Store_42', 'Store_43', 'Store_45'],
      dtype='object')
159874.46675185006
156505.19663797048
In [46]:
#Multiple Linear Regression

MLR = LinearRegression().fit(Train_X_std,Train_Y)
pred1 = MLR.predict(Train_X_std)
pred2 = MLR.predict(Test_X_std)

print('The Intercept of the Regresion Model was found to be',MLR.intercept_)

coefficients = pd.Series(MLR.coef_, index=Train_X_std.columns)
print('The Coefficients of the Regression Model are found to be:\n', coefficients)

Evaluate(0, pred1, pred2)
The Intercept of the Regresion Model was found to be 1051376.4668682504
The Coefficients of the Regression Model are found to be:
 CPI             519675
Unemployment    -41407
year_2011       -30764
year_2012       -66636
month_12         43592
Store_2          72217
Store_3        -172131
Store_4         244683
Store_5        -176496
Store_7         -84069
Store_8        -102505
Store_9        -147021
Store_10        248305
Store_12         47520
Store_13        250766
Store_14        154549
Store_15         36362
Store_16       -106276
Store_17         85044
Store_18        113528
Store_19        160259
Store_20        112408
Store_21       -110596
Store_22         96050
Store_23        121077
Store_24        150807
Store_25       -107991
Store_26         96052
Store_27        209716
Store_28         53792
Store_29         34176
Store_30       -161345
Store_34        111658
Store_35         77974
Store_36       -163099
Store_37       -145148
Store_40         58120
Store_42         42934
Store_43       -100386
Store_45        -38329
dtype: float64
In [52]:
#Evaluating the models

Model_Evaluation_Comparison_Matrix = pd.DataFrame(np.zeros([5,8]), 
                            columns=['Train-R2','Test-R2','Train-RSS','Test-RSS',
                            'Train-MSE','Test-MSE','Train-RMSE','Test-RMSE'])
rc=np.random.choice(Train_X_std.loc[:,Train_X_std.nunique()>=50].columns.values,2,replace=False)
def Evaluate(n, pred1,pred2):
    #Plotting predicted predicteds alongside the actual datapoints 
    plt.figure(figsize=[15,6])
    for e,i in enumerate(rc):
        plt.subplot(2,3,e+1)
        plt.scatter(y=Train_Y, x=Train_X_std[i], label='Actual')
        plt.scatter(y=pred1, x=Train_X_std[i], label='Prediction')
        plt.legend()
    plt.show()
    
#Evaluating the results of Multiple Linear Regression Model

print('\n\n{}Training Set Metrics{}'.format('-'*20, '-'*20))
print('\nR2-Score on Training set --->',round(r2_score(Train_Y, pred1),20))
print('Residual Sum of Squares (RSS) on Training set  --->',round(np.sum(np.square(Train_Y-pred1)),20))
print('Mean Squared Error (MSE) on Training set       --->',round(mean_squared_error(Train_Y, pred1),20))
print('Root Mean Squared Error (RMSE) on Training set --->',round(np.sqrt(mean_squared_error(Train_Y, pred1)),20))

print('\n{}Testing Set Metrics{}'.format('-'*20, '-'*20))
print('\nR2-Score on Testing set --->',round(r2_score(Test_Y, pred2),20))
print('Residual Sum of Squares (RSS) on Testing set  --->',round(np.sum(np.square(Test_Y-pred2)),20))
print('Mean Squared Error (MSE) on Testing set       --->',round(mean_squared_error(Test_Y, pred2),20))
print('Root Mean Squared Error (RMSE) on Testing set --->',round(np.sqrt(mean_squared_error(Test_Y, pred2)),20))
print('\n{}Residual Plots{}'.format('-'*20, '-'*20))
    
Model_Evaluation_Comparison_Matrix.loc[n,'Train-R2']  = round(r2_score(Train_Y, pred1),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-R2']   = round(r2_score(Test_Y, pred2),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Train-RSS'] = round(np.sum(np.square(Train_Y-pred1)),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-RSS']  = round(np.sum(np.square(Test_Y-pred2)),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Train-MSE'] = round(mean_squared_error(Train_Y, pred1),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-MSE']  = round(mean_squared_error(Test_Y, pred2),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Train-RMSE']= round(np.sqrt(mean_squared_error(Train_Y, pred1)),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-RMSE'] = round(np.sqrt(mean_squared_error(Test_Y, pred2)),20)

# Plotting y_test and y_pred and understanding the spread.
plt.figure(figsize=[15,4])
plt.subplot(1,2,1)
sns.distplot((Train_Y - pred1))
plt.title('Error Terms')          
plt.xlabel('Errors') 

plt.subplot(1,2,2)
plt.scatter(Train_Y,pred1)
plt.plot([Train_Y.min(),Train_Y.max()],[Train_Y.min(),Train_Y.max()], 'r--')
plt.title('Test data vs Prediction')         
plt.xlabel('y_test')                       
plt.ylabel('y_pred')                       
plt.show()

--------------------Training Set Metrics--------------------

R2-Score on Training set ---> 0.9221377512636295
Residual Sum of Squares (RSS) on Training set  ---> 106507874611658.14
Mean Squared Error (MSE) on Training set       ---> 25559845119.188416
Root Mean Squared Error (RMSE) on Training set ---> 159874.46675185006

--------------------Testing Set Metrics--------------------

R2-Score on Testing set ---> 0.9246903759637972
Residual Sum of Squares (RSS) on Testing set  ---> 43746063562396.0
Mean Squared Error (MSE) on Testing set       ---> 24493876574.689808
Root Mean Squared Error (RMSE) on Testing set ---> 156505.19663797048

--------------------Residual Plots--------------------
In [59]:
# Save the train and test datasets as CSV files in the specified directory
Train_X.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/train_X.csv', index=False)
Test_X.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/test_X.csv', index=False)
Train_Y.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/train_Y.csv', index=False)
Test_Y.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/test_Y.csv', index=False)
df.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/df.csv', index=False)
In [ ]: